{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selenium库是一个自动化测试工具,用于网络爬虫中来应对JavaScript渲染的页面的抓取.\n",
    "# chromium是开源的,chromium由不同深度的蓝色构成.chrome由蓝红绿黄四种颜色组成.\n",
    "# chromium是开发版. chrome是正式版.\n",
    "# Pyppeteer是基于Python的新特性async实现,它的一些执行支持异步操作.\n",
    "# 安装: pip install pyppeteer\n",
    "\n",
    "# pyppeteer详细用法: https://miyakogi.github.io/pyppeteer/reference.html\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quotes: 0\n"
     ]
    }
   ],
   "source": [
    "import pyppeteer\n",
    "\n",
    "# 测试基本的页面渲染操作,选用网址: http://quotes.toscrape.com/js/\n",
    "# 这个页面由JavaScript渲染而成,用requests库请求得到的HTML结果里面不包含页面中所见的条目内容的.\n",
    "# requests无法完成正常抓取,测试代码如下:\n",
    "import requests\n",
    "from pyquery import PyQuery as pq \n",
    "\n",
    "url = 'http://quotes.toscrape.com/js/'\n",
    "response = requests.get(url)\n",
    "doc = pq(response.text)\n",
    "print('Quotes:',doc('.quotes').length)\n",
    "\"\"\"\n",
    "说明: 使用requests请求网页内容,使用pyquery解析页面中的每一个条目. \n",
    "观察源码后,发现每个条目中的class名为quote,选用.quote这个CSS选择器来选择,最终输出条目数量.\n",
    "\n",
    "输出结果为0. \n",
    "\n",
    "这个页面是由JavaScript渲染而成的,所看到的内容都是网页加载后又执行了JavaScript之后才呈现出来的.\n",
    "因此这些条目数据不存在于原始HTML代码中,而requests仅抓取的是原始html代码.\n",
    "\n",
    "遇到这种类型的网站解决办法有以下三种:\n",
    "1. 分析网页源代码数据,数据被隐藏在HTML中,以JavaScript变量的形式存在,直接提取.\n",
    "2. 分析Ajax,很多数据是经过Ajax请求时获取的.可以分析其接口.\n",
    "3. 模拟JavaScript渲染过程,直接抓取渲染后结果.  Pyppeteer和Selenium用的是第三种方法.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "ename": "RuntimeError",
     "evalue": "This event loop is already running",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-8-c2c4416241c8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     11\u001b[0m     \u001b[0mawait\u001b[0m \u001b[0mbrowser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 13\u001b[1;33m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_event_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[1;34m(self, future)\u001b[0m\n\u001b[0;32m    453\u001b[0m         \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_run_until_complete_cb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    454\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 455\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_forever\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    456\u001b[0m         \u001b[1;32mexcept\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    457\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mnew_task\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcancelled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_forever\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    407\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_closed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    408\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_running\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 409\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'This event loop is already running'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    410\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mevents\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_running_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    411\u001b[0m             raise RuntimeError(\n",
      "\u001b[1;31mRuntimeError\u001b[0m: This event loop is already running"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Quotes: 10\n",
      "Quotes: 10\n"
     ]
    }
   ],
   "source": [
    "import asyncio\n",
    "from pyppeteer import launch \n",
    "from pyquery import PyQuery as pq \n",
    "\n",
    "async def main():\n",
    "    browser = await launch()\n",
    "    page = await browser.newPage()\n",
    "    await page.goto('http://quotes.toscrape.com/js/')\n",
    "    doc = pq(await page.content())\n",
    "    print('Quotes:',doc('.quote').length)\n",
    "    await browser.close()\n",
    "    \n",
    "asyncio.get_event_loop().run_until_complete(main())\n",
    "\"\"\"\n",
    "成功匹配出来class为quote的条目, 总数为10条.\n",
    "Pyppeteer整个流程完成了浏览器开启,新建页面,页面加载等操作. \n",
    "Pyppeteer里面进行了异步操作,需要配合async/await关键词来实现.\n",
    "\n",
    "首先,launch方法会新建一个Browser对象,然后赋值给browser,\n",
    "然后调用newPage方法相当于浏览器中新建了一个选项卡,同时新建了一个Page对象。\n",
    "然后Page对象调用了goto方法就相当于在浏览器中输入了这个URL,浏览器跳转到了对应的页面进行加载,\n",
    "加载完成之后再调用content方法,返回当前浏览器页面的源代码。然后进一步地,\n",
    "我们用pyquery进行同样地解析,就可以得到JavaScript渲染的结果了.\n",
    "\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "ename": "RuntimeError",
     "evalue": "This event loop is already running",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-11-ba527c73b57c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     20\u001b[0m     \u001b[0mawait\u001b[0m \u001b[0mbrowser\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     21\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_event_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[1;34m(self, future)\u001b[0m\n\u001b[0;32m    453\u001b[0m         \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_run_until_complete_cb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    454\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 455\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_forever\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    456\u001b[0m         \u001b[1;32mexcept\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    457\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mnew_task\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcancelled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_forever\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    407\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_closed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    408\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_running\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 409\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'This event loop is already running'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    410\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mevents\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_running_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    411\u001b[0m             raise RuntimeError(\n",
      "\u001b[1;31mRuntimeError\u001b[0m: This event loop is already running"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'width': 800, 'height': 600, 'deviceScaleFactor': 1}\n"
     ]
    }
   ],
   "source": [
    "# 模拟网页截图,保存PDF,执行自定义的JavaScript获得特定的内容\n",
    "import asyncio\n",
    "from pyppeteer import launch \n",
    "\n",
    "async def main():\n",
    "    browser = await launch()\n",
    "    page = await browser.newPage()\n",
    "    await page.goto('http://quotes.toscrape.com/js/')\n",
    "#     await page.screenshot(page='quotesscrape.png')\n",
    "    await page.pdf(path='quotesscrape.pdf')\n",
    "    dimensions = await page.evaluate('''() => {\n",
    "        return {\n",
    "            width: document.documentElement.clientWidth,\n",
    "            height: document.documentElement.clientHeight,\n",
    "            deviceScaleFactor: window.devicePixelRatio,\n",
    "        }\n",
    "    }''')\n",
    "    \n",
    "    print(dimensions)\n",
    "    await browser.close()\n",
    "    \n",
    "asyncio.get_event_loop().run_until_complete(main())\n",
    "\"\"\"\n",
    "网页截图保存,网页导出pdf保存,执行 JavaScript 并返回对应数据。\n",
    "  screenshot 方法可以传入保存的图片路径，另外还可以指定保存格式 type、\n",
    " 清晰度 quality、是否全屏 fullPage、裁切 clip 等各个参数实现截图。\n",
    " \n",
    " 其内容也是 JavaScript 渲染后的内容，\n",
    " 另外这个方法还可以指定放缩大小 scale、页码范围 pageRanges、\n",
    " 宽高 width 和 height、方向 landscape 等等参数\n",
    " \n",
    " 最后我们又调用了 evaluate 方法执行了一些 JavaScript，\n",
    " JavaScript 传入的是一个函数，\n",
    " 使用 return 方法返回了网页的宽高、像素大小比率三个值，\n",
    " 最后得到的是一个 JSON 格式的对象.\n",
    "\"\"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "ename": "RuntimeError",
     "evalue": "This event loop is already running",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-12-8a301713ac93>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      8\u001b[0m     \u001b[0mawait\u001b[0m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      9\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_event_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[1;34m(self, future)\u001b[0m\n\u001b[0;32m    453\u001b[0m         \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_run_until_complete_cb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    454\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 455\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_forever\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    456\u001b[0m         \u001b[1;32mexcept\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    457\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mnew_task\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcancelled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_forever\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    407\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_closed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    408\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_running\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 409\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'This event loop is already running'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    410\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mevents\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_running_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    411\u001b[0m             raise RuntimeError(\n",
      "\u001b[1;31mRuntimeError\u001b[0m: This event loop is already running"
     ]
    }
   ],
   "source": [
    "\n",
    "# 关闭headless模式.\n",
    "import asyncio \n",
    "from pyppeteer import launch \n",
    "\n",
    "async def main():\n",
    "    await launch(headless=False)\n",
    "    await asyncio.sleep(100)\n",
    "    \n",
    "asyncio.get_event_loop().run_until_complete(main())\n",
    "# 出现一个空白的Chromium界面."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "ename": "RuntimeError",
     "evalue": "This event loop is already running",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-13-479aeff7937c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     11\u001b[0m     \u001b[0mawait\u001b[0m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 13\u001b[1;33m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_event_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[1;34m(self, future)\u001b[0m\n\u001b[0;32m    453\u001b[0m         \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_run_until_complete_cb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    454\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 455\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_forever\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    456\u001b[0m         \u001b[1;32mexcept\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    457\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mnew_task\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcancelled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_forever\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    407\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_closed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    408\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_running\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 409\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'This event loop is already running'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    410\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mevents\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_running_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    411\u001b[0m             raise RuntimeError(\n",
      "\u001b[1;31mRuntimeError\u001b[0m: This event loop is already running"
     ]
    }
   ],
   "source": [
    "# 开启调试模式,\n",
    "# 爬虫时需要分析网页结构还有网络请求.\n",
    "# 将devtools参数设置为True,每开启一个界面,就弹出一个调试窗口.\n",
    "import asyncio\n",
    "from pyppeteer import launch \n",
    "\n",
    "async def main():\n",
    "    browser = await launch(devtools=True)\n",
    "    page = await browser.newPage()\n",
    "    await page.goto('https://www.baidu.com')\n",
    "    await asyncio.sleep(100)\n",
    "    \n",
    "asyncio.get_event_loop().run_until_complete(main())\n",
    "# devtools这个参数设置为True,headless就会被关闭.界面会展示出来."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "ename": "RuntimeError",
     "evalue": "This event loop is already running",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mRuntimeError\u001b[0m                              Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-14-388b7f7f7d45>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     10\u001b[0m     \u001b[0mawait\u001b[0m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     11\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 12\u001b[1;33m \u001b[0masyncio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_event_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_until_complete\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_until_complete\u001b[1;34m(self, future)\u001b[0m\n\u001b[0;32m    453\u001b[0m         \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd_done_callback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_run_until_complete_cb\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    454\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 455\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrun_forever\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    456\u001b[0m         \u001b[1;32mexcept\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    457\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mnew_task\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdone\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfuture\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcancelled\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32mg:\\itinstall\\python36\\Lib\\asyncio\\base_events.py\u001b[0m in \u001b[0;36mrun_forever\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    407\u001b[0m         \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_check_closed\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    408\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_running\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 409\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mRuntimeError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'This event loop is already running'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    410\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mevents\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_get_running_loop\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    411\u001b[0m             raise RuntimeError(\n",
      "\u001b[1;31mRuntimeError\u001b[0m: This event loop is already running"
     ]
    }
   ],
   "source": [
    "# 把\"Chrome正受到自动测试软件的控制\" 的提示去掉.\n",
    "# 打开淘宝页面\n",
    "import asyncio\n",
    "from pyppeteer import launch \n",
    "\n",
    "async def main():\n",
    "    browser = await launch(headless=False,args=['--disable-infobars'])\n",
    "    page = await browser.newPage()\n",
    "    await page.goto('https://www.taobao.com')\n",
    "    await asyncio.sleep(100)\n",
    "    \n",
    "asyncio.get_event_loop().run_until_complete(main())\n",
    "# 出现了显示的bug,整个浏览器窗口比显示的内容窗口要大."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "已经运行中\n"
     ]
    }
   ],
   "source": [
    "# 解决显示的bug,需要设置下window-size还有viewport\n",
    "import asyncio\n",
    "from pyppeteer import launch \n",
    "\n",
    "width,height = 1366,768 \n",
    "\n",
    "async def main():\n",
    "    browser = await launch(headless=False,args=['--disable-infobars',f'--window-size={width},{height}'])\n",
    "    page = await browser.newPage()\n",
    "    await page.setViewport({'width':width,'height':height})\n",
    "    await page.goto('https://www.taobao.com')\n",
    "    await asyncio.sleep(100)\n",
    "\n",
    "try:\n",
    "    asyncio.get_event_loop().run_until_complete(main())\n",
    "except RuntimeError:\n",
    "    print('已经运行中')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "已经运行中\n"
     ]
    }
   ],
   "source": [
    "# 淘宝主要通过window.navigator.webdriver来对webdriver进行检测，\n",
    "# 所以我们只需要使用JavaScript将它设置为false即可.\n",
    "import asyncio \n",
    "from pyppeteer import launch\n",
    "\n",
    "width,height = 1366,768 \n",
    "\n",
    "async def main():\n",
    "    browser = await launch(headless=False,args=['--disable-infobars',f'--window-size={width},{height}'])\n",
    "    page = await browser.newPage()\n",
    "    await page.setViewport({'width':width,'height':height})\n",
    "    await page.goto('https://login.taobao.com/member/login.jhtml?redirectURL=https://www.taobao.com/')\n",
    "    await page.evaluate(\n",
    "        '''() => { Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''')\n",
    "    await asyncio.sleep(100)\n",
    "    \n",
    "try:\n",
    "    asyncio.get_event_loop().run_until_complete(main())\n",
    "except RuntimeError:\n",
    "    print('已经运行中')\n",
    "    \n",
    "\"\"\"\n",
    "后面要加入输入用户名和密码的代码。\n",
    "成功规避了webdriver的检测,%save用鼠标拖动模拟就可以登录了.\n",
    "\n",
    "免去淘宝登录方法:\n",
    "1.设置用户目录: \n",
    "  把关键Cookies保存到本地用户目录. 下次登录时,直接读取并保持登录状态.\n",
    "  用户目录下存的信息包括了浏览器的基本配置信息,Cache,Cookies等各种信息.\n",
    "  在浏览器启动时读取这些信息,启动时可以恢复一些历史记录和登录状态信息.\n",
    "  \n",
    "\"\"\"\n",
    "# browser = await launch(headless=False, userDataDir='./userdata', args=['--disable-infobars'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
